{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# XGBoost Parameter Tuning for Rent Listing Inqueries \n",
    "\n",
    "Rental Listing Inquiries数据集是Kaggle平台上的一个分类竞赛任务，需要根据公寓的特征来预测其受欢迎程度（用户感兴趣程度分为高、中、低三类）。其中房屋的特征x共有14维，响应值y为用户对该公寓的感兴趣程度。评价标准为logloss。\n",
    "数据链接：https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "首先 import 必要的模块"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from xgboost import XGBClassifier\n",
    "import xgboost as xgb\n",
    "\n",
    "import pandas as pd \n",
    "import numpy as np\n",
    "\n",
    "import math\n",
    "\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "\n",
    "from sklearn.metrics import log_loss\n",
    "\n",
    "from matplotlib import pyplot\n",
    "import seaborn as sns\n",
    "%matplotlib inline\n",
    "\n",
    "#dpath = './data/'\n",
    "train = pd.read_csv(\"RentListingInquries_FE_train.csv\")\n",
    "test = pd.read_csv(\"RentListingInquries_FE_test.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_train = train['interest_level']\n",
    "train = train.drop([\"interest_level\"], axis=1)\n",
    "X_train = np.array(train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "xgb5_1 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,\n",
    "       colsample_bytree=0.9, gamma=0, learning_rate=0.1, max_delta_step=0,\n",
    "       max_depth=5, min_child_weight=5, missing=None, n_estimators=251,\n",
    "       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,\n",
    "       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=3, silent=True,\n",
    "       subsample=0.8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "ename": "XGBoostError",
     "evalue": "need to call fit or load_model beforehand",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mXGBoostError\u001b[0m                              Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-6-d2f1f44db37c>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0mxgb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"xgb_model.pkl\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mtrain_predprob\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mxgb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict_proba\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      5\u001b[0m \u001b[0mlogloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlog_loss\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain_predprob\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/APPs/Anaconda3-5.0.1-Linux-x86_64/Anaconda3-5.0.1/lib/python3.6/site-packages/xgboost/sklearn.py\u001b[0m in \u001b[0;36mpredict_proba\u001b[0;34m(self, data, ntree_limit)\u001b[0m\n\u001b[1;32m    630\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mntree_limit\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    631\u001b[0m             \u001b[0mntree_limit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"best_ntree_limit\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 632\u001b[0;31m         class_probs = self.get_booster().predict(test_dmatrix,\n\u001b[0m\u001b[1;32m    633\u001b[0m                                                  ntree_limit=ntree_limit)\n\u001b[1;32m    634\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobjective\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"multi:softprob\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/APPs/Anaconda3-5.0.1-Linux-x86_64/Anaconda3-5.0.1/lib/python3.6/site-packages/xgboost/sklearn.py\u001b[0m in \u001b[0;36mget_booster\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    177\u001b[0m         \"\"\"\n\u001b[1;32m    178\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_Booster\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 179\u001b[0;31m             \u001b[0;32mraise\u001b[0m \u001b[0mXGBoostError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'need to call fit or load_model beforehand'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    180\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_Booster\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    181\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mXGBoostError\u001b[0m: need to call fit or load_model beforehand"
     ]
    }
   ],
   "source": [
    "import pickle\n",
    "xgb = pickle.load(open(\"xgb_model.pkl\", 'rb'))\n",
    "\n",
    "train_predprob = xgb.predict_proba(X_train)\n",
    "logloss = log_loss(y_train, train_predprob)\n",
    "\n",
    "#Print model report:\n",
    "print( 'logloss of train is:', logloss)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "ename": "XGBoostError",
     "evalue": "need to call fit or load_model beforehand",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mXGBoostError\u001b[0m                              Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-8-73fe6e351bd5>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mxgb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"xgb_model.pkl\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0my_test_pred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mxgb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict_proba\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mout_df1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_test_pred\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mout_df1\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"high\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"medium\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"low\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/APPs/Anaconda3-5.0.1-Linux-x86_64/Anaconda3-5.0.1/lib/python3.6/site-packages/xgboost/sklearn.py\u001b[0m in \u001b[0;36mpredict_proba\u001b[0;34m(self, data, ntree_limit)\u001b[0m\n\u001b[1;32m    630\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mntree_limit\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    631\u001b[0m             \u001b[0mntree_limit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"best_ntree_limit\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 632\u001b[0;31m         class_probs = self.get_booster().predict(test_dmatrix,\n\u001b[0m\u001b[1;32m    633\u001b[0m                                                  ntree_limit=ntree_limit)\n\u001b[1;32m    634\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobjective\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"multi:softprob\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/APPs/Anaconda3-5.0.1-Linux-x86_64/Anaconda3-5.0.1/lib/python3.6/site-packages/xgboost/sklearn.py\u001b[0m in \u001b[0;36mget_booster\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    177\u001b[0m         \"\"\"\n\u001b[1;32m    178\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_Booster\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 179\u001b[0;31m             \u001b[0;32mraise\u001b[0m \u001b[0mXGBoostError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'need to call fit or load_model beforehand'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    180\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_Booster\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    181\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mXGBoostError\u001b[0m: need to call fit or load_model beforehand"
     ]
    }
   ],
   "source": [
    "xgb = pickle.load(open(\"xgb_model.pkl\", 'rb'))\n",
    "y_test_pred = xgb.predict_proba(test)\n",
    "\n",
    "out_df1 = pd.DataFrame(y_test_pred)\n",
    "out_df1.columns = [\"high\", \"medium\", \"low\"]\n",
    "\n",
    "out_df = pd.concat([test_id,out_df1], axis = 1)\n",
    "out_df.to_csv(\"xgb_Rent.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
